Load packages and data
library("phyloseq"); packageVersion("phyloseq")
## [1] '1.30.0'
data(GlobalPatterns)
library("ggplot2"); packageVersion("ggplot2")
## [1] '3.3.2'
library("plyr"); packageVersion("plyr")
## [1] '1.8.6'
Change theme for all future plots to theme_bw
theme_set(theme_bw())
There are tools in the phyloseq package for preprocessing data. You can see examples and details of preprocessing in this dedicated preprocessing tutorial.
When preprocessing you must think hard about the preprocessing that you do and be able to explain why you did them. The steps you took during preprocessing should be clearly documented.
We will preprocess by first filtering low-occurrence, poorly-represented OTUs from the data. Below, the OTUs are removed and indicated not to appear more than 5 times in more than half the samples.
GP = GlobalPatterns
wh0 = genefilter_sample(GP, filterfun_sample(function(x) x > 5), A=0.5*nsamples(GP))
GP1 = prune_taxa(wh0, GP)
Next, we will transform the data to even the sampling depth.
GP1 = transform_sample_counts(GP1, function(x) 1E6 * x/sum(x))
Next, we keep only the top five most abundant phyla.
phylum.sum = tapply(taxa_sums(GP1), tax_table(GP1)[, "Phylum"], sum, na.rm=TRUE)
top5phyla = names(sort(phylum.sum, TRUE))[1:5]
GP1 = prune_taxa((tax_table(GP1)[, "Phylum"] %in% top5phyla), GP1)
The preprocessing still leaves us with 204 OTUs. A major problem here is that some are human-associated microbiomes and some are not. Next, we will define human-associated versus non-human.
human = get_variable(GP1, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue")
sample_data(GP1)$human <- factor(human)
The plot_ordination() function supports four basic representations of an ordination.
We’ll start by plotting just the OTUs and coloring the points by Phylum.
GP.ord <- ordinate(GP1, "NMDS", "bray")
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.1333468
## Run 1 stress 0.1449078
## Run 2 stress 0.1385324
## Run 3 stress 0.1684976
## Run 4 stress 0.1570722
## Run 5 stress 0.168749
## Run 6 stress 0.1453281
## Run 7 stress 0.1385323
## Run 8 stress 0.1577221
## Run 9 stress 0.166209
## Run 10 stress 0.1644737
## Run 11 stress 0.1465222
## Run 12 stress 0.1333468
## ... New best solution
## ... Procrustes: rmse 1.807537e-06 max resid 4.642766e-06
## ... Similar to previous best
## Run 13 stress 0.1642865
## Run 14 stress 0.1453585
## Run 15 stress 0.1465222
## Run 16 stress 0.1510167
## Run 17 stress 0.1782388
## Run 18 stress 0.1333468
## ... Procrustes: rmse 1.321953e-06 max resid 3.415021e-06
## ... Similar to previous best
## Run 19 stress 0.2835701
## Run 20 stress 0.1647386
## *** Solution reached
p1 = plot_ordination(GP1, GP.ord, type="taxa", color="Phylum", title="taxa")
print(p1)
This is a complicated looking plot, which is not necessarily a good thing. We can use facet_wrap() function to simplify the plot.
p1 + facet_wrap(~Phylum, 3)
Next, we’ll plot only the samples and color the points by sample type while also modifying the shape according to whether they are human-associated or not.
p2 = plot_ordination(GP1, GP.ord, type="samples", color="SampleType", shape="human")
p2 + geom_polygon(aes(fill=SampleType)) + geom_point(size=5) + ggtitle("samples")
The plot_ordination() function also allows you to be able to plot both samples and OTUs on the same graph.
p3 = plot_ordination(GP1, GP.ord, type="biplot", color="SampleType", shape="Phylum", title="biplot")
# Some stuff to modify the automatic shape scale
GP1.shape.names = get_taxa_unique(GP1, "Phylum")
GP1.shape <- 15:(15 + length(GP1.shape.names) - 1)
names(GP1.shape) <- GP1.shape.names
GP1.shape["samples"] <- 16
p3 + scale_shape_manual(values=GP1.shape)
And, of course, we can simplify this graph (reduce occlusion) by using the argument type=“split” instead of the facet_wrap approach.
p4 = plot_ordination(GP1, GP.ord, type="split", color="Phylum", shape="human", label="SampleType", title="split")
p4
Here we try different method parameter options to the plot_ordination() function.
dist = "bray"
ord_meths = c("DCA", "CCA", "RDA", "DPCoA", "NMDS", "MDS", "PCoA")
plist = llply(as.list(ord_meths), function(i, physeq, dist){
ordi = ordinate(physeq, method=i, distance=dist)
plot_ordination(physeq, ordi, "samples", color="SampleType")
}, GP1, dist)
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.1333468
## Run 1 stress 0.1385323
## Run 2 stress 0.1469145
## Run 3 stress 0.1690982
## Run 4 stress 0.1470663
## Run 5 stress 0.1746122
## Run 6 stress 0.1684972
## Run 7 stress 0.147642
## Run 8 stress 0.168709
## Run 9 stress 0.2566499
## Run 10 stress 0.1570722
## Run 11 stress 0.1529609
## Run 12 stress 0.1850505
## Run 13 stress 0.148501
## Run 14 stress 0.1565314
## Run 15 stress 0.157975
## Run 16 stress 0.1564653
## Run 17 stress 0.146027
## Run 18 stress 0.1333468
## ... New best solution
## ... Procrustes: rmse 5.484805e-06 max resid 1.584682e-05
## ... Similar to previous best
## Run 19 stress 0.1385323
## Run 20 stress 0.1529686
## *** Solution reached
Store the ordination methods plot results in a list.
names(plist) <- ord_meths
Next, we extract the data from each of the individual plots and put the extracted data into one big dataframe so that we can include all plots in one graphic.
pdataframe = ldply(plist, function(x){
df = x$data[, 1:2]
colnames(df) = c("Axis_1", "Axis_2")
return(cbind(df, x$data))
})
names(pdataframe)[1] = "method"
Next, we can use our dataframe to make a standard faceted ggplot scatterplot.
p = ggplot(pdataframe, aes(Axis_1, Axis_2, color=SampleType, shape=human, fill=SampleType))
p = p + geom_point(size=4) + geom_polygon()
p = p + facet_wrap(~method, scales="free")
p = p + scale_fill_brewer(type="qual", palette="Set1")
p = p + scale_colour_brewer(type="qual", palette="Set1")
p
To replot a larger version of an individual plot, we can print the plist from which the pdataframe was made. Here, we plot the second element in the list.
plist[[2]]
We can add some additional layers to improve the aesthetics of the plot.
p = plist[[2]] + scale_colour_brewer(type="qual", palette="Set1")
p = p + scale_fill_brewer(type="qual", palette="Set1")
p = p + geom_point(size=5) + geom_polygon(aes(fill=SampleType))
p
Next, we can use the ordinate() function to simultaneously perform weighted UniFrac and perform a Principal Component Analysis on the distance matrix. Then, we can pass the data and the ordination results to plot_ordination to create a default ggplot2 graphic.
ordu = ordinate(GP1, "PCoA", "unifrac", weighted=TRUE)
plot_ordination(GP1, ordu, color="SampleType", shape="human")
Then, we add layers to improve the aesthetics of the plot.
p = plot_ordination(GP1, ordu, color="SampleType", shape="human")
p = p + geom_point(size=7, alpha=0.75)
p = p + scale_colour_brewer(type="qual", palette="Set1")
p + ggtitle("MDS/PCoA on weighted-UniFrac distance, GlobalPatterns")
Load packages and GlobalPatterns data
library("phyloseq"); packageVersion("phyloseq")
## [1] '1.30.0'
data("GlobalPatterns")
library("ggplot2"); packageVersion("ggplot2")
## [1] '3.3.2'
Set the ggplot themes
theme_set(theme_bw())
pal = "Set1"
scale_colour_discrete <- function(palname=pal, ...){
scale_colour_brewer(palette=palname, ...)
}
scale_fill_discrete <- function(palname=pal, ...){
scale_fill_brewer(palette=palname, ...)
}
First, we prune the OTUs that are not present in any of the samples. Although it is tempting to trim noise right away, many richness estimates are modeled on singletons and doubletons in the abundance data. Therefore, it is important to leave the noise in the dataset if you want a meaningful estimate.
GP <- prune_species(speciesSums(GlobalPatterns) > 0, GlobalPatterns)
## Warning: 'prune_species' is deprecated.
## Use 'prune_taxa' instead.
## See help("Deprecated") and help("phyloseq-deprecated").
## Warning: 'speciesSums' is deprecated.
## Use 'taxa_sums' instead.
## See help("Deprecated") and help("phyloseq-deprecated").
Here, we plot the default graphic produced by plot_richness() function.
plot_richness(GP)
Next, we specify a measures argument to the plot_richness() function. This will include the alpha-diversity measures that we’re interested in (i.e. Chao1 and Shannon).
plot_richness(GP, measures=c("Chao1", "Shannon"))
Next, we specify a sample variable on which to group/organize samples along the x axis. In this case, we use the SampleType variable.
plot_richness(GP, x="SampleType", measures=c("Chao1", "Shannon"))
If we wanted to use an external variable in the plot that isn’t in the GP dataset already (e.g. such as human-associated versus not human-associated), we first have to define this new variable as a factor.
sampleData(GP)$human <- getVariable(GP, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue")
## Warning: 'getVariable' is deprecated.
## Use 'get_variable' instead.
## See help("Deprecated") and help("phyloseq-deprecated").
## Warning: 'sampleData' is deprecated.
## Use 'sample_data' instead.
## See help("Deprecated") and help("phyloseq-deprecated").
## Warning: 'sampleData<-' is deprecated.
## Use 'sample_data<-' instead.
## See help("Deprecated") and help("phyloseq-deprecated").
Then, we tell the plot_richness() function to map the new variable on the x axis and color the points according to the SampleType they belong to.
plot_richness(GP, x="human", color="SampleType", measures=c("Chao1", "Shannon"))
If we wanted to merge samples from the SampleType environment and we would use the merge_samples() function to merge the samples.
GPst = merge_samples(GP, "SampleType")
# repair variables that were damaged during merge (coerced to numeric)
sample_data(GPst)$SampleType <- factor(sample_names(GPst))
sample_data(GPst)$human <- as.logical(sample_data(GPst)$human)
Then, we plot the environment-merged version of the data, and add a geom_point() layer to change the size and transparency of the points.
p = plot_richness(GPst, x="human", color="SampleType", measures=c("Chao1", "Shannon"))
p + geom_point(size=5, alpha=0.7)
To remove the original layer points, first we check which lists are present in p.
p$layers
## [[1]]
## geom_point: na.rm = TRUE
## stat_identity: na.rm = TRUE
## position_identity
##
## [[2]]
## mapping: ymax = ~value + se, ymin = ~value - se
## geom_errorbar: na.rm = FALSE, orientation = NA, width = 0.1, width = 0.1, flipped_aes = FALSE
## stat_identity: na.rm = FALSE
## position_identity
We can see that the first layer is the one specifying the original points. Next, we use negative indexing to remove that layer and add a new geom_point with larger point size.
p$layers <- p$layers[-1]
p + geom_point(size=5, alpha=0.7)
library("phyloseq"); packageVersion("phyloseq")
## [1] '1.30.0'
library("ggplot2"); packageVersion("ggplot2")
## [1] '3.3.2'
theme_set(theme_bw())
data("GlobalPatterns")
gpt <- subset_taxa(GlobalPatterns, Kingdom=="Bacteria")
gpt <- prune_taxa(names(sort(taxa_sums(gpt),TRUE)[1:300]), gpt)
plot_heatmap(gpt, sample.label="SampleType")
## Warning: Transformation introduced infinite values in discrete y-axis
gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota")
plot_heatmap(gpac)
## Warning: Transformation introduced infinite values in discrete y-axis
(p <- plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family"))
## Warning: Transformation introduced infinite values in discrete y-axis
p$scales$scales[[1]]$name <- "My X-Axis"
p$scales$scales[[2]]$name <- "My Y-Axis"
print(p)
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family", low="#000033", high="#CCFF66")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family", low="#000033", high="#FF3300")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family", low="#000033", high="#66CCFF")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family", low="#66CCFF", high="#000033", na.value="white")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family", low="#FFFFCC", high="#000033", na.value="white")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "NMDS", "jaccard")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "DCA", "none", "SampleType", "Family")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "RDA", "none", "SampleType", "Family")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "PCoA", "bray", "SampleType", "Family")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "PCoA", "unifrac", "SampleType", "Family")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "MDS", "unifrac", "SampleType", "Family", weighted=TRUE)
## Warning: Transformation introduced infinite values in discrete y-axis
heatmap(otu_table(gpac))
library(phyloseq); packageVersion("phyloseq")
## [1] '1.30.0'
packageVersion("ggplot2")
## [1] '3.3.2'
data(enterotype)
set.seed(711L)
enterotype = subset_samples(enterotype, !is.na(Enterotype))
plot_net(enterotype, maxdist = 0.4, point_label = "Sample_ID")
plot_net(enterotype, maxdist = 0.3, color = "SeqTech", shape="Enterotype")
ig <- make_network(enterotype, max.dist=0.3)
plot_network(ig, enterotype)
## Warning: attributes are not identical across measure variables; they will be
## dropped
plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.4, label=NULL)
## Warning: attributes are not identical across measure variables; they will be
## dropped
ig <- make_network(enterotype, max.dist=0.2)
plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.4, label=NULL)
## Warning: attributes are not identical across measure variables; they will be
## dropped
ig <- make_network(enterotype, dist.fun="bray", max.dist=0.3)
plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.4, label=NULL)
## Warning: attributes are not identical across measure variables; they will be
## dropped